This R Markdown document reads, combines, and summarizes multiple
_summary.csv files. ***
0. Packages
This section loads the packages used for import, manipulation, and
plotting.
tidyverse provides dplyr,
readr, purrr, stringr, and
ggplot2.
# Load the tidyverse meta-package (dplyr, readr, purrr, stringr, ggplot2, ...).
library(tidyverse)

# Enlarge figures and raise DPI so plot labels stay legible in the report.
knitr::opts_chunk$set(
  fig.width = 16,
  fig.height = 8,
  dpi = 200,
  out.width = "100%",
  fig.align = "center"
)

# Ensure rmarkdown can find Pandoc.
# On Windows, Quarto bundles pandoc in: <quarto>/bin/tools/pandoc.exe,
# so point RSTUDIO_PANDOC at that tools directory when pandoc is not on PATH.
if (!nzchar(Sys.which("pandoc"))) {
  quarto_exe <- Sys.which("quarto")
  if (nzchar(quarto_exe)) {
    quarto_tools <- normalizePath(
      file.path(dirname(quarto_exe), "tools"),
      winslash = "/",
      mustWork = FALSE
    )
    if (dir.exists(quarto_tools)) {
      Sys.setenv(RSTUDIO_PANDOC = quarto_tools)
    }
  }
}
1. Read and combine ALL CSVs
Reads all _summary.csv files from folder
and combines them into a single master table dat_raw.
folder: directory containing summary CSVs
files: all matching file paths
raw_list: list of per-file data frames
dat_raw: combined data frame
# Directories that hold the per-batch summary CSV exports.
folders <- c(
  "C:\\Users\\dunnmk\\University of Michigan Dropbox\\MED-WILSONTELAB\\wilsontelab box\\Common\\Projects\\Yeast Aim 3\\3C Sequencing Data\\3C_summary",
  "C:\\Users\\dunnmk\\University of Michigan Dropbox\\MED-WILSONTELAB\\wilsontelab box\\Common\\Projects\\Yeast Aim 3\\Sequencing\\PD Data"
)

# Collect every *_summary.csv / *_collapsed_summary.csv path across folders.
list_summary_csvs <- function(dir) {
  list.files(dir, pattern = "(_summary|_collapsed_summary)\\.csv$", full.names = TRUE)
}
files <- unlist(lapply(folders, list_summary_csvs), use.names = FALSE)
# Read each summary CSV into a list of data frames.
# Older exports lack a `repeat` column, so add one with the sentinel "ALL"
# to keep the schema consistent across batches before bind_rows().
raw_list <- lapply(files, function(f) {
  # show_col_types = FALSE silences readr's per-file column-spec banner
  # (the knitted report was previously flooded with those messages).
  dat <- read_csv(f, show_col_types = FALSE)
  if (!("repeat" %in% names(dat))) {
    dat <- dat %>% mutate(`repeat` = "ALL")
  }
  dat %>%
    mutate(
      # Track provenance of every row for later debugging.
      source_file = basename(f),
      # Force a single type so files with/without `repeat` bind cleanly.
      `repeat` = as.character(`repeat`)
    )
})
## Rows: 76 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 76 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 153 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 184 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 139 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 183 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack all per-file tables into one master table, then turn `batch` into a
# factor with sorted levels so groupings and plots use a stable batch order.
dat_raw <- raw_list %>%
  bind_rows() %>%
  mutate(batch = factor(batch, levels = sort(unique(batch))))
str(dat_raw)
## tibble [1,107 × 12] (S3: tbl_df/tbl/data.frame)
## $ batch : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 1 1 1 1 ...
## $ DSB2_loci : chr [1:1107] "chr7" "chr7" "chr7" "chr7" ...
## $ time_point : num [1:1107] 0 0 0 0 0 0 0 0 0 0 ...
## $ replicate : num [1:1107] 1 1 1 1 1 1 1 1 1 1 ...
## $ alignment_name: chr [1:1107] "CIS_A_to_B_DSB1_Chr12_L01" "CIS_A_to_B_DSB1_Chr12_L05" "CIS_A_to_B_DSB1_Chr12_L07" "CIS_A_to_B_DSB1_Chr12_L12" ...
## $ cis_trans : chr [1:1107] "CIS" "CIS" "CIS" "CIS" ...
## $ DSB : chr [1:1107] "DSB1" "DSB1" "DSB1" "DSB1" ...
## $ combo : chr [1:1107] "A_to_B" "A_to_B" "A_to_B" "A_to_B" ...
## $ allele : chr [1:1107] "Chr12_L01" "Chr12_L05" "Chr12_L07" "Chr12_L12" ...
## $ count : num [1:1107] 23365 15119 19481 22779 27868 ...
## $ repeat : chr [1:1107] "ALL" "ALL" "ALL" "ALL" ...
## $ source_file : chr [1:1107] "batch4_T0_3C_summary.csv" "batch4_T0_3C_summary.csv" "batch4_T0_3C_summary.csv" "batch4_T0_3C_summary.csv" ...
# Show the first few combined rows as a sanity check.
dat_raw %>%
  head(10) %>%
  knitr::kable(caption = "Preview: combined raw `_summary.csv` rows (first 10)")
Preview: combined raw _summary.csv rows (first
10)
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr12_L01 |
CIS |
DSB1 |
A_to_B |
Chr12_L01 |
23365 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr12_L05 |
CIS |
DSB1 |
A_to_B |
Chr12_L05 |
15119 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr12_L07 |
CIS |
DSB1 |
A_to_B |
Chr12_L07 |
19481 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr12_L12 |
CIS |
DSB1 |
A_to_B |
Chr12_L12 |
22779 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr15_L01 |
CIS |
DSB1 |
A_to_B |
Chr15_L01 |
27868 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr15_L04 |
CIS |
DSB1 |
A_to_B |
Chr15_L04 |
54 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr15_L15 |
CIS |
DSB1 |
A_to_B |
Chr15_L15 |
14456 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr15_L17_2 |
CIS |
DSB1 |
A_to_B |
Chr15_L17_2 |
28338 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr15_L18 |
CIS |
DSB1 |
A_to_B |
Chr15_L18 |
4689 |
ALL |
batch4_T0_3C_summary.csv |
| 4 |
chr7 |
0 |
1 |
CIS_A_to_B_DSB1_Chr3_L02 |
CIS |
DSB1 |
A_to_B |
Chr3_L02 |
26669 |
ALL |
batch4_T0_3C_summary.csv |
2. Aggregate counts and percentages
Purpose: calculate percentages from aggregated counts.
# Summarize counts and derived percentages for an (already grouped) table.
#
# For each group, totals the raw counts, the CIS/TRANS class counts, and the
# four combo-specific counts, then expresses each as a percent of the group
# total. Percentages are NA when the group total is zero (avoids 0-division).
#
# @param dat A (grouped) data frame with `count`, `cis_trans`, and `combo`.
# @return One summary row per group; grouping is dropped (.groups = "drop").
summarize_counts_pct <- function(dat){
  # Percent helper: NA (not NaN/Inf) when the denominator is zero.
  pct <- function(part, total) {
    if_else(total > 0, 100 * part / total, NA_real_)
  }
  dat %>%
    summarise(
      # na.rm = TRUE for consistency with the other summarise() calls in this
      # report, so a stray NA count cannot blank out a whole group.
      Total_Counts = sum(count, na.rm = TRUE),
      Cis_Counts = sum(count[cis_trans == "CIS"], na.rm = TRUE),
      Trans_Counts = sum(count[cis_trans == "TRANS"], na.rm = TRUE),
      A_to_B_Counts = sum(count[combo == "A_to_B"], na.rm = TRUE),
      C_to_D_Counts = sum(count[combo == "C_to_D"], na.rm = TRUE),
      A_to_D_Counts = sum(count[combo == "A_to_D"], na.rm = TRUE),
      C_to_B_Counts = sum(count[combo == "C_to_B"], na.rm = TRUE),
      Percent_Cis = pct(Cis_Counts, Total_Counts),
      Percent_Trans = pct(Trans_Counts, Total_Counts),
      Percent_A_to_B = pct(A_to_B_Counts, Total_Counts),
      Percent_C_to_D = pct(C_to_D_Counts, Total_Counts),
      Percent_A_to_D = pct(A_to_D_Counts, Total_Counts),
      Percent_C_to_B = pct(C_to_B_Counts, Total_Counts),
      .groups = "drop"
    )
}
# Aggregate the raw counts to one row per batch × time_point × DSB.
dat_agg_counts <- dat_raw |>
  group_by(batch, time_point, DSB) |>
  summarize_counts_pct()
str(dat_agg_counts)
## tibble [24 × 16] (S3: tbl_df/tbl/data.frame)
## $ batch : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 2 2 2 2 ...
## $ time_point : num [1:24] 0 0 0 120 120 120 0 0 0 120 ...
## $ DSB : chr [1:24] "DSB1" "DSB2" "TRANS" "DSB1" ...
## $ Total_Counts : num [1:24] 499775 458690 3754 538320 483621 ...
## $ Cis_Counts : num [1:24] 499775 458690 0 538320 483621 ...
## $ Trans_Counts : num [1:24] 0 0 3754 0 0 ...
## $ A_to_B_Counts : num [1:24] 499775 0 0 538320 0 ...
## $ C_to_D_Counts : num [1:24] 0 458690 0 0 483621 ...
## $ A_to_D_Counts : num [1:24] 0 0 3061 0 0 ...
## $ C_to_B_Counts : num [1:24] 0 0 693 0 0 ...
## $ Percent_Cis : num [1:24] 100 100 0 100 100 0 100 100 0 100 ...
## $ Percent_Trans : num [1:24] 0 0 100 0 0 100 0 0 100 0 ...
## $ Percent_A_to_B: num [1:24] 100 0 0 100 0 0 100 0 0 100 ...
## $ Percent_C_to_D: num [1:24] 0 100 0 0 100 0 0 100 0 0 ...
## $ Percent_A_to_D: num [1:24] 0 0 81.5 0 0 ...
## $ Percent_C_to_B: num [1:24] 0 0 18.5 0 0 ...
# Preview the aggregated table.
dat_agg_counts %>%
  head(10) %>%
  knitr::kable(caption = "Aggregated counts + derived percentages by batch × time_point × DSB (first 10)")
Aggregated counts + derived percentages by batch × time_point ×
DSB (first 10)
| 4 |
0 |
DSB1 |
499775 |
499775 |
0 |
499775 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| 4 |
0 |
DSB2 |
458690 |
458690 |
0 |
0 |
458690 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| 4 |
0 |
TRANS |
3754 |
0 |
3754 |
0 |
0 |
3061 |
693 |
0 |
100 |
0 |
0 |
81.53969 |
18.46031 |
| 4 |
120 |
DSB1 |
538320 |
538320 |
0 |
538320 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| 4 |
120 |
DSB2 |
483621 |
483621 |
0 |
0 |
483621 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| 4 |
120 |
TRANS |
31673 |
0 |
31673 |
0 |
0 |
18323 |
13350 |
0 |
100 |
0 |
0 |
57.85054 |
42.14946 |
| 6 |
0 |
DSB1 |
1461335 |
1461335 |
0 |
1461335 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| 6 |
0 |
DSB2 |
1945011 |
1945011 |
0 |
0 |
1945011 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| 6 |
0 |
TRANS |
18616 |
0 |
18616 |
0 |
0 |
15477 |
3139 |
0 |
100 |
0 |
0 |
83.13816 |
16.86184 |
| 6 |
120 |
DSB1 |
169434 |
169434 |
0 |
169434 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
The table above is now aggregated to batch × time_point × DSB.
***
2.5 Batch-level QC plots (counts + composition)
Purpose: quick sanity-check plots before allele-level
cis/trans normalization.
# Aggregate from the master raw table.
# NOTE: this is exactly the aggregation already stored in `dat_agg_counts`
# (same grouping, same summary function), so reuse that result instead of
# recomputing it from dat_raw.
agg_qc <- dat_agg_counts
# From here on in QC, compute CIS/TRANS using ONLY the 4 combos of interest:
#   CIS   := A_to_B + C_to_D
#   TRANS := A_to_D + C_to_B
# and define percents relative to the total of these 4 combos.
#
# IMPORTANT: we intentionally DO NOT group by `DSB` here.
# In this dataset, `DSB` can include levels like "TRANS", and grouping by it
# will split CIS and TRANS into different facets and produce misleading 100% bars.
agg_qc_combo4 <- dat_raw %>%
  group_by(batch, time_point) %>%
  summarise(
    # na.rm = TRUE: consistent with the other count aggregations in this
    # report; a stray NA count should not blank out a whole batch/time cell.
    Total_All = sum(count, na.rm = TRUE),
    A_to_B = sum(count[combo == "A_to_B"], na.rm = TRUE),
    C_to_D = sum(count[combo == "C_to_D"], na.rm = TRUE),
    A_to_D = sum(count[combo == "A_to_D"], na.rm = TRUE),
    C_to_B = sum(count[combo == "C_to_B"], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    Cis_Counts = A_to_B + C_to_D,
    Trans_Counts = A_to_D + C_to_B,
    Total_4Combos = Cis_Counts + Trans_Counts,
    # Counts outside the 4 combos (e.g. NO_CUT / spike-in controls) land here.
    Excluded_Counts = pmax(Total_All - Total_4Combos, 0),
    Percent_Cis = if_else(Total_4Combos > 0, 100 * Cis_Counts / Total_4Combos, NA_real_),
    Percent_Trans = if_else(Total_4Combos > 0, 100 * Trans_Counts / Total_4Combos, NA_real_),
    Percent_A_to_B_in_Cis = if_else(Cis_Counts > 0, 100 * A_to_B / Cis_Counts, NA_real_),
    Percent_C_to_D_in_Cis = if_else(Cis_Counts > 0, 100 * C_to_D / Cis_Counts, NA_real_),
    Percent_A_to_D_in_Trans = if_else(Trans_Counts > 0, 100 * A_to_D / Trans_Counts, NA_real_),
    Percent_C_to_B_in_Trans = if_else(Trans_Counts > 0, 100 * C_to_B / Trans_Counts, NA_real_)
  )
# Quick diagnostic table: a large Excluded_Counts means the 4-combo
# definition is omitting real counts.
agg_qc_combo4 |>
  arrange(time_point, batch) |>
  transmute(
    batch, time_point,
    Total_All, Total_4Combos, Excluded_Counts,
    Percent_Cis, Percent_Trans
  ) |>
  head(20) |>
  knitr::kable(caption = "QC (4-combo definition): totals vs excluded counts, plus CIS%/TRANS% (first 20 rows)")
QC (4-combo definition): totals vs excluded counts, plus
CIS%/TRANS% (first 20 rows)
| 4 |
0 |
962219 |
962219 |
0 |
99.60986 |
0.3901399 |
| 6 |
0 |
3424962 |
3424962 |
0 |
99.45646 |
0.5435389 |
| 8 |
0 |
1946822 |
1946822 |
0 |
99.18847 |
0.8115277 |
| 4 |
120 |
1053614 |
1053614 |
0 |
96.99387 |
3.0061294 |
| 6 |
120 |
253131 |
253131 |
0 |
95.00496 |
4.9950421 |
| 8 |
120 |
195628 |
195628 |
0 |
95.88300 |
4.1169976 |
| 6 |
180 |
3539160 |
3539160 |
0 |
93.49416 |
6.5058375 |
| 8 |
180 |
3492432 |
3492432 |
0 |
92.71874 |
7.2812584 |
# Formatting helpers (avoid extra package dependencies).
# Render numbers with thousands separators; NA stays NA (as character).
comma_label <- function(x) {
  out <- formatC(x, format = "f", digits = 0, big.mark = ",")
  out[is.na(x)] <- NA_character_
  out
}
# Render a percentage rounded to `digits` decimals; NA stays NA (as character).
pct_label <- function(x, digits = 1) {
  out <- paste0(round(x, digits), "%")
  out[is.na(x)] <- NA_character_
  out
}
# 1) Total counts by batch, dodged by DSB, one facet per time point.
dodge_08 <- position_dodge(width = 0.8)
p_total_counts <- ggplot(agg_qc, aes(x = batch, y = Total_Counts, fill = DSB)) +
  geom_col(position = dodge_08, width = 0.75) +
  geom_text(
    aes(label = comma_label(Total_Counts)),
    position = dodge_08,
    vjust = -0.25,
    size = 2.7
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "Total counts by batch (faceted by time_point)",
    x = "Batch",
    y = "Total counts",
    fill = "DSB"
  )
# Only print when there is data to show.
if (nrow(agg_qc) > 0) {
  print(p_total_counts)
}

# 2) CIS-only counts by category within each batch (4-combo definition).
# Long format: one row per batch/time × metric (CIS total, A_to_B, C_to_D).
cis_counts_long <- agg_qc_combo4 |>
  select(batch, time_point, Cis_Counts, A_to_B, C_to_D) |>
  pivot_longer(
    cols = c(Cis_Counts, A_to_B, C_to_D),
    names_to = "Metric",
    values_to = "Counts"
  ) |>
  mutate(
    Metric = factor(
      Metric,
      levels = c("Cis_Counts", "A_to_B", "C_to_D"),
      labels = c("CIS (total)", "A to B (cis)", "C to D (cis)")
    )
  )
p_cis_counts <- ggplot(cis_counts_long, aes(x = batch, y = Counts, fill = Metric)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = comma_label(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 2.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "CIS counts by category per batch",
    x = "Batch",
    y = "Counts",
    fill = ""
  )
if (nrow(cis_counts_long) > 0) {
  print(p_cis_counts)
}

# 3) TRANS-only counts by category within each batch (4-combo definition).
# Long format: one row per batch/time × metric (TRANS total, A_to_D, C_to_B).
trans_counts_long <- agg_qc_combo4 |>
  select(batch, time_point, Trans_Counts, A_to_D, C_to_B) |>
  pivot_longer(
    cols = c(Trans_Counts, A_to_D, C_to_B),
    names_to = "Metric",
    values_to = "Counts"
  ) |>
  mutate(
    Metric = factor(
      Metric,
      levels = c("Trans_Counts", "A_to_D", "C_to_B"),
      labels = c("TRANS (total)", "A to D (trans)", "C to B (trans)")
    )
  )
p_trans_counts <- ggplot(trans_counts_long, aes(x = batch, y = Counts, fill = Metric)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = comma_label(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 2.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "TRANS counts by category per batch",
    x = "Batch",
    y = "Counts",
    fill = ""
  )
if (nrow(trans_counts_long) > 0) {
  print(p_trans_counts)
}

# 4) CIS vs TRANS percent for each batch (single plot; bars side-by-side).
# Percent_Cis / Percent_Trans are computed relative to Total_4Combos.
pct_cis_trans_long <- agg_qc_combo4 |>
  select(batch, time_point, Percent_Cis, Percent_Trans) |>
  pivot_longer(
    cols = c(Percent_Cis, Percent_Trans),
    names_to = "Class",
    values_to = "Percent"
  ) |>
  mutate(
    Class = if_else(Class == "Percent_Cis", "CIS", "TRANS"),
    Label = pct_label(Percent)
  )
p_pct_cis_trans <- ggplot(pct_cis_trans_long, aes(x = batch, y = Percent, fill = Class)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = Label),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.2
  ) +
  facet_wrap(~ time_point) +
  # Headroom above 100% so the text labels are not clipped.
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "QC: CIS vs TRANS percent per batch (within group)",
    x = "Batch",
    y = "% of total counts",
    fill = ""
  )
if (nrow(pct_cis_trans_long) > 0 && any(!is.na(pct_cis_trans_long$Percent))) {
  print(p_pct_cis_trans)
}

# 5b) Percent TRANS per batch at T0 vs T120 (single plot; bars side-by-side).
pct_trans_time_compare <- agg_qc_combo4 |>
  filter(time_point %in% c(0, 120)) |>
  transmute(
    batch,
    time_point = factor(time_point, levels = c(0, 120), labels = c("T0", "T120")),
    Percent_Trans,
    Label = pct_label(Percent_Trans)
  )
p_pct_trans_t0_t120 <- ggplot(
  pct_trans_time_compare,
  aes(x = batch, y = Percent_Trans, fill = time_point)
) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = Label),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.2
  ) +
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "QC: TRANS% per batch (T0 vs T120)",
    x = "Batch",
    y = "TRANS% of total",
    fill = "Time"
  )
if (nrow(pct_trans_time_compare) > 0 && any(!is.na(pct_trans_time_compare$Percent_Trans))) {
  print(p_pct_trans_t0_t120)
}

# 6) Within-class combo composition (stacked; should sum to 100% within class).
# Use ONLY the two combos per class (A_to_B/C_to_D within CIS;
# A_to_D/C_to_B within TRANS).
agg_combo_within_2only <- agg_qc_combo4
cis_combo_long <- agg_combo_within_2only |>
  select(batch, time_point, Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis) |>
  pivot_longer(
    cols = c(Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis),
    names_to = "Combo",
    values_to = "Percent"
  ) |>
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_B_in_Cis = "A_to_B (within CIS)",
      Percent_C_to_D_in_Cis = "C_to_D (within CIS)"
    )
  )
p_cis_combo_comp <- ggplot(cis_combo_long, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    # Hide labels on empty/zero segments so stacked text stays readable.
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "QC: Within-CIS composition (A_to_B + C_to_D = 100%)",
    x = "Batch",
    y = "% of CIS",
    fill = ""
  )
if (nrow(cis_combo_long) > 0 && any(!is.na(cis_combo_long$Percent))) {
  print(p_cis_combo_comp)
}

# Same within-class composition plot, for the TRANS side.
trans_combo_long <- agg_combo_within_2only |>
  select(batch, time_point, Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans) |>
  pivot_longer(
    cols = c(Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans),
    names_to = "Combo",
    values_to = "Percent"
  ) |>
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_D_in_Trans = "A_to_D (within TRANS)",
      Percent_C_to_B_in_Trans = "C_to_B (within TRANS)"
    )
  )
p_trans_combo_comp <- ggplot(trans_combo_long, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    # Hide labels on empty/zero segments so stacked text stays readable.
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "QC: Within-TRANS composition (A_to_D + C_to_B = 100%)",
    x = "Batch",
    y = "% of TRANS",
    fill = ""
  )
if (nrow(trans_combo_long) > 0 && any(!is.na(trans_combo_long$Percent))) {
  print(p_trans_combo_comp)
}

3. cis/trans normalization — compute totals and percentages per
allele
# Allele-level CIS/TRANS normalization.
#
# Within each (batch × time_point × DSB), sums CIS and TRANS counts per
# allele, attaches the group totals, and expresses each allele's counts as a
# percent of its group's CIS (resp. TRANS) total. Percentages are NA when the
# corresponding group total is zero (avoids division by zero).
#
# @param dat Data frame with batch, time_point, DSB, allele, cis_trans, count.
# @return One row per batch × time_point × DSB × allele with per-allele
#   counts, group totals, and within-class percentages.
my_summarize_cistrans <- function(dat){
  by_allele <- dat %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      # na.rm = TRUE for consistency with the other summarise() calls in
      # this report; a stray NA count should not blank out a whole allele.
      Cis_Location_Counts = sum(count[cis_trans == "CIS"], na.rm = TRUE),
      Trans_Location_Counts = sum(count[cis_trans == "TRANS"], na.rm = TRUE),
      .groups = "drop"
    )
  # Group totals: the denominators for the within-class percentages.
  totals <- by_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_Cis_Location_Counts = sum(Cis_Location_Counts),
      Total_Trans_Location_Counts = sum(Trans_Location_Counts),
      .groups = "drop"
    )
  by_allele %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      Percent_Location_in_Cis = if_else(
        Total_Cis_Location_Counts > 0,
        100 * Cis_Location_Counts / Total_Cis_Location_Counts,
        NA_real_
      ),
      Percent_Location_in_Trans = if_else(
        Total_Trans_Location_Counts > 0,
        100 * Trans_Location_Counts / Total_Trans_Location_Counts,
        NA_real_
      )
    )
}
# Apply the allele-level CIS/TRANS normalization and preview the result.
dat_norm <- my_summarize_cistrans(dat_raw)
dat_norm %>%
  head(10) %>%
  knitr::kable(caption = "Preview: allele-level CIS/TRANS counts and within-class percentages (first 10)")
Preview: allele-level CIS/TRANS counts and within-class
percentages (first 10)
| 4 |
0 |
DSB1 |
Chr12_L01 |
23365 |
0 |
499775 |
0 |
4.6751038 |
NA |
| 4 |
0 |
DSB1 |
Chr12_L05 |
15119 |
0 |
499775 |
0 |
3.0251613 |
NA |
| 4 |
0 |
DSB1 |
Chr12_L07 |
19481 |
0 |
499775 |
0 |
3.8979541 |
NA |
| 4 |
0 |
DSB1 |
Chr12_L12 |
22779 |
0 |
499775 |
0 |
4.5578510 |
NA |
| 4 |
0 |
DSB1 |
Chr15_L01 |
27868 |
0 |
499775 |
0 |
5.5761092 |
NA |
| 4 |
0 |
DSB1 |
Chr15_L04 |
54 |
0 |
499775 |
0 |
0.0108049 |
NA |
| 4 |
0 |
DSB1 |
Chr15_L15 |
14456 |
0 |
499775 |
0 |
2.8925016 |
NA |
| 4 |
0 |
DSB1 |
Chr15_L17_2 |
28338 |
0 |
499775 |
0 |
5.6701516 |
NA |
| 4 |
0 |
DSB1 |
Chr15_L18 |
4689 |
0 |
499775 |
0 |
0.9382222 |
NA |
| 4 |
0 |
DSB1 |
Chr3_L02 |
26669 |
0 |
499775 |
0 |
5.3362013 |
NA |
3.5 Uncuttable-normalized allele combo profiles
For this experiment, we normalize counts by the
uncuttable/control signal within each (batch ×
time_point × DSB).
This section produces:
- A table of per-allele, per-combo counts normalized to
uncuttable.
- A plot of % of uncuttable by allele, colored by
combo, for each batch and time.
# Define how to identify NO_CUT / uncuttable reads.
# IMPORTANT: in this experiment we also have rows like "1PERCENTCONTROL".
# We therefore:
#   1) match NO_CUT labels fairly broadly (to catch variants like
#      DSB2_NO_CUT_SSA), and
#   2) explicitly EXCLUDE any 1% spike-in control rows from the NO_CUT
#      denominator.
uncuttable_pattern <- "NO[_\\W]*CUT|NOCUT|UNCUT"
uncuttable_regex <- stringr::regex(uncuttable_pattern, ignore_case = TRUE)
# The 1% spike-in appears in labels like "CHRXV_L18_1PERCENT_CONTROL".
percent_control_regex <- stringr::regex("1PERCENT|PERCENTCONTROL|PERCENT_CONTROL", ignore_case = TRUE)

# Determine which column actually contains NO_CUT labels; different
# runs/pipelines sometimes store these labels in different fields.
# Consider every character and factor column.
char_like_cols <- names(dat_raw)[vapply(dat_raw, function(x) is.character(x) || is.factor(x), logical(1))]
if (length(char_like_cols) == 0) {
  stop("No character/factor columns found in dat_raw; cannot locate NO_CUT labels.")
}

# Count how many rows of a column look like NO_CUT (excluding 1% controls).
count_nocut_hits <- function(values) {
  v <- as.character(values)
  hits <- stringr::str_detect(v, uncuttable_regex) &
    !stringr::str_detect(v, percent_control_regex)
  sum(hits, na.rm = TRUE)
}
nocut_field_scores <- tibble(
  field = char_like_cols,
  n_matched = vapply(char_like_cols, function(col) count_nocut_hits(dat_raw[[col]]), numeric(1))
) %>%
  arrange(desc(n_matched))
# Use the field with the most matches as the NO_CUT label source.
nocut_field <- nocut_field_scores$field[[1]]
nocut_field_scores %>%
  head(10) %>%
  knitr::kable(caption = "Diagnostic: candidate fields for NO_CUT labels (counts of matching rows; top 10).")
Diagnostic: candidate fields for NO_CUT labels (counts of
matching rows; top 10).
| alignment_name |
78 |
| allele |
78 |
| batch |
0 |
| DSB2_loci |
0 |
| cis_trans |
0 |
| DSB |
0 |
| combo |
0 |
| repeat |
0 |
| source_file |
0 |
# Warn loudly if the matcher found nothing: all normalized values
# downstream would then be NA/0.
top_match_count <- nocut_field_scores$n_matched[[1]]
if (is.na(top_match_count) || top_match_count == 0) {
  warning(
    "No NO_CUT rows detected in any character field using pattern '",
    uncuttable_pattern,
    "'. Normalized values will be NA/0. Check how NO_CUT is labeled in your input CSVs."
  )
}
# Focus on the 4 key combos used throughout the report.
combos_4 <- c("A_to_B", "C_to_D", "A_to_D", "C_to_B")

# Aggregate to allele × combo within each batch/time/replicate/DSB/repeat.
dat_allele_combo_counts <- dat_raw |>
  filter(combo %in% combos_4) |>
  group_by(batch, time_point, replicate, DSB, `repeat`, allele, combo) |>
  summarise(Counts = sum(count, na.rm = TRUE), .groups = "drop") |>
  mutate(combo = factor(combo, levels = combos_4))
# Compute the DSB-specific NO_CUT denominator per batch/time/replicate/DSB/
# repeat-class. If NO_CUT exists in multiple categories/classes for a given
# DSB, their counts are summed into a single group denominator.
dat_uncuttable_denoms <- dat_raw %>%
  group_by(batch, time_point, replicate, DSB, `repeat`) %>%
  summarise(
    Uncuttable_Counts = {
      labels <- .data[[nocut_field]]
      is_nocut <- stringr::str_detect(labels, uncuttable_regex) &
        !stringr::str_detect(labels, percent_control_regex)
      # NA labels index out as NA and are dropped by na.rm = TRUE,
      # matching the original if_else() + na.rm behavior.
      sum(count[is_nocut], na.rm = TRUE)
    },
    .groups = "drop"
  )
# Diagnostic: how many groups have a usable NO_CUT denominator?
dat_uncuttable_denoms |>
  summarise(
    Groups = n(),
    Groups_with_NO_CUT = sum(Uncuttable_Counts > 0, na.rm = TRUE),
    Groups_with_NO_CUT_zero = sum(Uncuttable_Counts <= 0, na.rm = TRUE),
    # suppressWarnings: min/max over an empty/all-NA vector would warn.
    Min_NO_CUT = suppressWarnings(min(Uncuttable_Counts, na.rm = TRUE)),
    Median_NO_CUT = suppressWarnings(median(Uncuttable_Counts, na.rm = TRUE)),
    Max_NO_CUT = suppressWarnings(max(Uncuttable_Counts, na.rm = TRUE))
  ) |>
  knitr::kable(caption = "Diagnostic: summary of NO_CUT denominators across batch × time_point × DSB groups")
Diagnostic: summary of NO_CUT denominators across batch ×
time_point × DSB groups
| 42 |
42 |
0 |
20 |
8441 |
127246 |
# If denominators are all zero, show candidate label values containing
# CUT/NOCUT/UNCUT/INTACT to help refine the NO_CUT matcher.
if (all(dat_uncuttable_denoms$Uncuttable_Counts <= 0, na.rm = TRUE)) {
  cut_like_regex <- stringr::regex("CUT|NOCUT|NO[_\\W]*CUT|UNCUT|INTACT", ignore_case = TRUE)
  dat_raw |>
    mutate(.nocut_field_value = as.character(.data[[nocut_field]])) |>
    filter(stringr::str_detect(.nocut_field_value, cut_like_regex)) |>
    count(Value = .nocut_field_value, sort = TRUE, name = "Row_Count") |>
    head(30) |>
    knitr::kable(caption = paste0(
      "No positive NO_CUT denominators found. Here are common values containing CUT/NOCUT/UNCUT/INTACT in field '",
      nocut_field,
      "' (first 30). Use this to refine the NO_CUT matcher."
    ))
}
# Diagnostic: the most common allele labels contributing to the NO_CUT
# denominators, weighted by read count.
dat_raw |>
  filter(
    stringr::str_detect(.data[[nocut_field]], uncuttable_regex),
    !stringr::str_detect(.data[[nocut_field]], percent_control_regex)
  ) |>
  count(Value = .data[[nocut_field]], wt = count, sort = TRUE, name = "Total_Counts") |>
  head(20) |>
  knitr::kable(caption = paste0(
    "Top allele labels matched as NO_CUT (weighted by count; first 20). ",
    "Matching field used: ", nocut_field, ". ",
    "If this is empty, the NO_CUT matcher is not finding your labels."
  ))
Top allele labels matched as NO_CUT (weighted by count; first
20). Matching field used: alignment_name. If this is empty, the NO_CUT
matcher is not finding your labels.
| CIS_DSB1_NO_REPEAT_NO_CUT |
241530 |
| CIS_A_to_B_DSB1_UNCUTTABLE |
195175 |
| CIS_C_to_D_DSB2_UNCUTTABLE |
121969 |
| CIS_DSB2_NO_REPEAT_NO_CUT |
111129 |
| CIS_DSB1_FULL_NO_CUT |
71311 |
| CIS_DSB2_FULL_NO_CUT |
41151 |
| TRANS_A_to_D_UNCUTTABLE |
5025 |
| TRANS_DSB1A_DSB2D_NO_REPEAT_DSB1_NO_CUT |
2065 |
| TRANS_DSB1A_DSB2D_FULL_DSB1_NO_CUT |
1130 |
| TRANS_DSB2C_DSB1B_NO_REPEAT_DSB2_NO_CUT |
338 |
| TRANS_DSB2C_DSB1B_FULL_DSB2_NO_CUT |
251 |
| CIS_DSB2_FULL_DSB1_NO_CUT |
248 |
| TRANS_C_to_B_UNCUTTABLE |
209 |
| CIS_DSB2_NO_REPEAT_DSB1_NO_CUT |
116 |
| CIS_DSB1_FULL_DSB2_NO_CUT |
66 |
| CIS_DSB1_NO_REPEAT_DSB2_NO_CUT |
46 |
| TRANS_DSB1A_DSB2D_NO_REPEAT_DSB2_NO_CUT |
9 |
| TRANS_DSB2C_DSB1B_NO_REPEAT_DSB1_NO_CUT |
3 |
| TRANS_DSB2C_DSB1B_FULL_DSB1_NO_CUT |
2 |
| TRANS_DSB1A_DSB2D_FULL_DSB2_NO_CUT |
1 |
# Diagnostic: show the actual NO_CUT denominators being used. This confirms,
# e.g., that Batch 4 DSB1 is normalized by Batch 4 DSB1 NO_CUT counts
# (per time_point).
dat_uncuttable_denoms |>
  arrange(batch, time_point, replicate, DSB, `repeat`) |>
  head(20) |>
  knitr::kable(caption = "Uncuttable denominators (NO_CUT): Uncuttable_Counts per batch × time_point × replicate × DSB × repeat-class (first 20 rows)")
Uncuttable denominators (NO_CUT): Uncuttable_Counts per batch ×
time_point × replicate × DSB × repeat-class (first 20 rows)
| 4 |
0 |
1 |
DSB1 |
ALL |
28178 |
| 4 |
0 |
1 |
DSB2 |
ALL |
8847 |
| 4 |
0 |
1 |
TRANS |
ALL |
115 |
| 4 |
120 |
1 |
DSB1 |
ALL |
103838 |
| 4 |
120 |
1 |
DSB2 |
ALL |
23505 |
| 4 |
120 |
1 |
TRANS |
ALL |
2106 |
| 6 |
0 |
1 |
DSB1 |
ALL |
13004 |
| 6 |
0 |
1 |
DSB1 |
INTACT |
23997 |
| 6 |
0 |
1 |
DSB1 |
SSA |
15119 |
| 6 |
0 |
1 |
DSB2 |
ALL |
2571 |
| 6 |
0 |
1 |
DSB2 |
INTACT |
21258 |
| 6 |
0 |
1 |
DSB2 |
SSA |
10877 |
| 6 |
0 |
1 |
TRANS |
ALL |
482 |
| 6 |
0 |
1 |
TRANS |
INTACT |
524 |
| 6 |
0 |
1 |
TRANS |
SSA |
140 |
| 6 |
120 |
1 |
DSB1 |
ALL |
20548 |
| 6 |
120 |
1 |
DSB2 |
ALL |
2325 |
| 6 |
120 |
1 |
TRANS |
ALL |
731 |
| 6 |
180 |
1 |
DSB1 |
INTACT |
22097 |
| 6 |
180 |
1 |
DSB1 |
SSA |
91176 |
# Flag groups whose denominator is zero: Percent_of_Uncuttable will be NA there.
if (any(dat_uncuttable_denoms$Uncuttable_Counts <= 0, na.rm = TRUE)) {
  message("Note: Some batch × time_point × DSB groups have Uncuttable_Counts = 0; Percent_of_Uncuttable will be NA for those groups.")
}
# Normalize each allele × combo count by the NO_CUT denominator of its group.
dat_uncut_norm <- dat_allele_combo_counts |>
  left_join(dat_uncuttable_denoms, by = c("batch", "time_point", "replicate", "DSB", "repeat")) |>
  mutate(
    # NA (not Inf) when the denominator is missing or zero.
    Percent_of_Uncuttable = if_else(
      Uncuttable_Counts > 0,
      100 * Counts / Uncuttable_Counts,
      NA_real_
    )
  )
# Diagnostic: % of NO_CUT can legitimately exceed 100 — it is a ratio to the
# NO_CUT signal, not a within-group probability. Still, values >>100 can also
# indicate a very small NO_CUT denominator or a labeling mismatch.
over_100 <- dat_uncut_norm |>
  filter(!is.na(Percent_of_Uncuttable), Percent_of_Uncuttable > 100) |>
  arrange(desc(Percent_of_Uncuttable))
if (nrow(over_100) == 0) {
  message("No allele×combo rows exceed 100% of NO_CUT.")
} else {
  over_100 |>
    transmute(
      batch, time_point, DSB, allele, combo,
      Counts, Uncuttable_Counts,
      Percent_of_Uncuttable = round(Percent_of_Uncuttable, 2)
    ) |>
    head(25) |>
    knitr::kable(caption = "Rows where % of NO_CUT > 100 (top 25). Large values usually mean NO_CUT counts are low for that batch/time/DSB.")
}
Rows where % of NO_CUT > 100 (top 25). Large values usually
mean NO_CUT counts are low for that batch/time/DSB.
| 8 |
0 |
DSB2 |
COMMON |
C_to_D |
1044235 |
3550 |
29415.07 |
| 8 |
180 |
DSB2 |
COMMON |
C_to_D |
527722 |
5283 |
9989.06 |
| 6 |
0 |
DSB2 |
COMMON |
C_to_D |
1715641 |
21258 |
8070.57 |
| 6 |
180 |
TRANS |
COMMON |
C_to_B |
52528 |
781 |
6725.74 |
| 6 |
0 |
DSB2 |
Chr4 |
C_to_D |
164802 |
2571 |
6410.04 |
| 4 |
0 |
DSB2 |
Chr7 |
C_to_D |
449843 |
8847 |
5084.70 |
| 6 |
180 |
DSB2 |
COMMON |
C_to_D |
563623 |
11308 |
4984.29 |
| 8 |
180 |
TRANS |
COMMON |
C_to_B |
70029 |
1474 |
4750.95 |
| 8 |
0 |
DSB2 |
Chr15 |
C_to_D |
213677 |
4936 |
4328.95 |
| 6 |
180 |
TRANS |
COMMON |
C_to_B |
14469 |
366 |
3953.28 |
| 8 |
180 |
DSB2 |
COMMON |
C_to_D |
1155167 |
34704 |
3328.63 |
| 6 |
120 |
DSB2 |
Chr4 |
C_to_D |
68728 |
2325 |
2956.04 |
| 8 |
180 |
TRANS |
CHRXV_L01 |
A_to_D |
34978 |
1474 |
2373.00 |
| 8 |
180 |
TRANS |
COMMON |
C_to_B |
9081 |
421 |
2157.01 |
| 6 |
180 |
TRANS |
CHRIV_L10_2 |
A_to_D |
15870 |
781 |
2032.01 |
| 4 |
120 |
DSB2 |
Chr7 |
C_to_D |
460116 |
23505 |
1957.52 |
| 6 |
180 |
TRANS |
CHRIII_L04 |
A_to_D |
14606 |
781 |
1870.17 |
| 6 |
180 |
DSB2 |
COMMON |
C_to_D |
1117160 |
64040 |
1744.47 |
| 6 |
180 |
TRANS |
CHRXII_L05 |
A_to_D |
13382 |
781 |
1713.44 |
| 6 |
180 |
TRANS |
CHRXV_L01 |
A_to_D |
12888 |
781 |
1650.19 |
| 8 |
180 |
TRANS |
CHRXII_L12 |
A_to_D |
19413 |
1474 |
1317.03 |
| 8 |
180 |
TRANS |
CHRXV_L01 |
A_to_D |
5302 |
421 |
1259.38 |
| 6 |
180 |
TRANS |
CHRXII_L07 |
A_to_D |
9125 |
781 |
1168.37 |
| 6 |
180 |
TRANS |
CHRIII_L03 |
A_to_D |
7997 |
781 |
1023.94 |
| 6 |
0 |
DSB1 |
CHRIII_L04 |
A_to_B |
228463 |
23997 |
952.05 |
# ---- Group-level CIS/TRANS after NO_CUT normalization ----
# Note: NO_CUT normalization rescales counts within each (batch × time_point × DSB).
# That means CIS and TRANS *fractions of total* are unchanged, but CIS/TRANS as
# a percent of NO_CUT are newly normalized and can be compared across groups.
# The two combo pairs partition the 4-combo set into CIS vs TRANS classes
# (presumably same-junction vs cross-junction ligations — confirm with assay design).
cis_combos <- c("A_to_B", "C_to_D")
trans_combos <- c("A_to_D", "C_to_B")
# NOTE(review): `combos_4` is referenced here but its visible definition
# (c("A_to_B", "C_to_D", "A_to_D", "C_to_B")) appears only in a *later* chunk;
# this works only if an earlier, not-shown chunk also defines it — confirm.
# Collapse raw rows to one count per (batch × time × replicate × DSB × repeat × combo),
# then fix the combo display order via a factor.
dat_group_combo_counts <- dat_raw %>%
filter(combo %in% combos_4) %>%
group_by(batch, time_point, replicate, DSB, `repeat`, combo) %>%
summarise(Counts = sum(count, na.rm = TRUE), .groups = "drop") %>%
mutate(combo = factor(combo, levels = combos_4))
# Sum the CIS and TRANS combo pairs per group, attach the NO_CUT denominator
# for the same group keys, and convert to % of NO_CUT. Groups with a
# non-positive denominator get NA rather than Inf/NaN.
dat_group_cistrans_norm <- dat_group_combo_counts %>%
group_by(batch, time_point, replicate, DSB, `repeat`) %>%
summarise(
Cis_Counts_4 = sum(Counts[combo %in% cis_combos], na.rm = TRUE),
Trans_Counts_4 = sum(Counts[combo %in% trans_combos], na.rm = TRUE),
.groups = "drop"
) %>%
left_join(dat_uncuttable_denoms, by = c("batch", "time_point", "replicate", "DSB", "repeat")) %>%
mutate(
Percent_Cis_of_NO_CUT = if_else(Uncuttable_Counts > 0, 100 * Cis_Counts_4 / Uncuttable_Counts, NA_real_),
Percent_Trans_of_NO_CUT = if_else(Uncuttable_Counts > 0, 100 * Trans_Counts_4 / Uncuttable_Counts, NA_real_)
)
# Preview the first 20 rows in grouping-key order.
dat_group_cistrans_norm %>%
arrange(batch, time_point, replicate, DSB, `repeat`) %>%
head(20) %>%
knitr::kable(caption = "Preview: CIS and TRANS (4-combo) normalized to NO_CUT (% of NO_CUT; first 20 rows)")
Preview: CIS and TRANS (4-combo) normalized to NO_CUT (% of
NO_CUT; first 20 rows)
| 4 |
0 |
1 |
DSB1 |
ALL |
499775 |
0 |
28178 |
1773.6355 |
0.000 |
| 4 |
0 |
1 |
DSB2 |
ALL |
458690 |
0 |
8847 |
5184.6954 |
0.000 |
| 4 |
0 |
1 |
TRANS |
ALL |
0 |
3754 |
115 |
0.0000 |
3264.348 |
| 4 |
120 |
1 |
DSB1 |
ALL |
538320 |
0 |
103838 |
518.4229 |
0.000 |
| 4 |
120 |
1 |
DSB2 |
ALL |
483621 |
0 |
23505 |
2057.5239 |
0.000 |
| 4 |
120 |
1 |
TRANS |
ALL |
0 |
31673 |
2106 |
0.0000 |
1503.941 |
| 6 |
0 |
1 |
DSB1 |
ALL |
223186 |
0 |
13004 |
1716.2873 |
0.000 |
| 6 |
0 |
1 |
DSB1 |
INTACT |
1193579 |
0 |
23997 |
4973.8676 |
0.000 |
| 6 |
0 |
1 |
DSB1 |
SSA |
44570 |
0 |
15119 |
294.7946 |
0.000 |
| 6 |
0 |
1 |
DSB2 |
ALL |
167373 |
0 |
2571 |
6510.0350 |
0.000 |
| 6 |
0 |
1 |
DSB2 |
INTACT |
1741835 |
0 |
21258 |
8193.7859 |
0.000 |
| 6 |
0 |
1 |
DSB2 |
SSA |
35803 |
0 |
10877 |
329.1625 |
0.000 |
| 6 |
0 |
1 |
TRANS |
ALL |
0 |
11169 |
482 |
0.0000 |
2317.220 |
| 6 |
0 |
1 |
TRANS |
INTACT |
0 |
6013 |
524 |
0.0000 |
1147.519 |
| 6 |
0 |
1 |
TRANS |
SSA |
0 |
1434 |
140 |
0.0000 |
1024.286 |
| 6 |
120 |
1 |
DSB1 |
ALL |
169434 |
0 |
20548 |
824.5766 |
0.000 |
| 6 |
120 |
1 |
DSB2 |
ALL |
71053 |
0 |
2325 |
3056.0430 |
0.000 |
| 6 |
120 |
1 |
TRANS |
ALL |
0 |
12644 |
731 |
0.0000 |
1729.685 |
| 6 |
180 |
1 |
DSB1 |
INTACT |
504314 |
0 |
22097 |
2282.2736 |
0.000 |
| 6 |
180 |
1 |
DSB1 |
SSA |
1045209 |
0 |
91176 |
1146.3642 |
0.000 |
# Quick look at the first 12 uncuttable-normalized rows, sorted by grouping keys.
dat_uncut_norm %>%
  arrange(batch, time_point, replicate, DSB, `repeat`, allele, combo) %>%
  head(12) %>%
  knitr::kable(caption = "Preview: allele×combo counts normalized to uncuttable (% of uncuttable; first 12 rows)")
Preview: allele×combo counts normalized to uncuttable (% of
uncuttable; first 12 rows)
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L01 |
A_to_B |
23365 |
28178 |
82.9192987 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L05 |
A_to_B |
15119 |
28178 |
53.6553339 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L07 |
A_to_B |
19481 |
28178 |
69.1354958 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L12 |
A_to_B |
22779 |
28178 |
80.8396621 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L01 |
A_to_B |
27868 |
28178 |
98.8998509 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L04 |
A_to_B |
54 |
28178 |
0.1916389 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L15 |
A_to_B |
14456 |
28178 |
51.3024345 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L17_2 |
A_to_B |
28338 |
28178 |
100.5678189 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L18 |
A_to_B |
4689 |
28178 |
16.6406416 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr3_L02 |
A_to_B |
26669 |
28178 |
94.6447583 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr3_L03 |
A_to_B |
19 |
28178 |
0.0674285 |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr3_L04 |
A_to_B |
38319 |
28178 |
135.9890695 |
# Plot: percent-of-uncuttable by allele, per batch.
# Requested: make DSB1 and DSB2 separate plots (and also include TRANS), and add data labels.
plot_df <- dat_uncut_norm %>%
  filter(!is.na(Percent_of_Uncuttable)) %>%
  mutate(
    DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")),
    # NA rows were removed by the filter above, so every label is well-defined;
    # the previous is.na() branch here was unreachable and has been removed.
    Label = paste0(round(Percent_of_Uncuttable, 1), "%")
  )
if (nrow(plot_df) == 0) {
  message("No uncuttable-normalized values to plot (Percent_of_Uncuttable is all NA).")
} else {
  batches <- plot_df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_b <- plot_df %>% filter(batch == b)
    if (nrow(df_b) == 0) next
    # One plot per DSB class so allele axes stay readable.
    for (dsb_name in c("DSB1", "DSB2", "TRANS")) {
      df_bd <- df_b %>% filter(DSB == dsb_name)
      if (nrow(df_bd) == 0) next
      p <- ggplot(df_bd, aes(x = allele, y = Percent_of_Uncuttable, fill = combo)) +
        geom_col(position = position_dodge(width = 0.9), width = 0.85) +
        # Rotated labels above each dodged bar.
        geom_text(
          aes(label = Label),
          position = position_dodge(width = 0.9),
          vjust = -0.25,
          size = 2.6,
          angle = 90
        ) +
        facet_grid(`repeat` ~ time_point, scales = "free_x") +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        labs(
          title = paste0("Uncuttable-normalized allele×combo (% of NO_CUT) | Batch: ", b, " | ", dsb_name),
          x = "Allele",
          y = paste0("% of NO_CUT (within batch × time_point × ", dsb_name, ")"),
          fill = "Combo"
        )
      print(p)
    }
  }
}









# Plot: CIS% of NO_CUT and TRANS% of NO_CUT (group-level), separately for DSB1, DSB2, and TRANS
# Reshape the two percent columns into long form so CIS and TRANS become a
# fill aesthetic, then draw one dodged bar chart per batch × DSB class.
plot_ct <- dat_group_cistrans_norm |>
  filter(!is.na(Percent_Cis_of_NO_CUT) | !is.na(Percent_Trans_of_NO_CUT)) |>
  pivot_longer(
    cols = c(Percent_Cis_of_NO_CUT, Percent_Trans_of_NO_CUT),
    names_to = "Class",
    values_to = "Percent"
  ) |>
  mutate(
    Class = recode(
      Class,
      Percent_Cis_of_NO_CUT = "CIS",
      Percent_Trans_of_NO_CUT = "TRANS"
    ),
    Label = if_else(is.na(Percent), NA_character_, paste0(round(Percent, 1), "%")),
    DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))
  )
if (nrow(plot_ct) == 0) {
  message("No group-level CIS/TRANS NO_CUT-normalized values to plot.")
} else {
  batches_ct <- sort(unique(plot_ct$batch))
  for (batch_id in batches_ct) {
    df_batch <- filter(plot_ct, batch == batch_id)
    if (nrow(df_batch) == 0) next
    for (dsb_level in c("DSB1", "DSB2", "TRANS")) {
      df_panel <- filter(df_batch, DSB == dsb_level)
      if (nrow(df_panel) == 0) next
      p <- ggplot(df_panel, aes(x = factor(time_point), y = Percent, fill = Class)) +
        geom_col(position = position_dodge(width = 0.9), width = 0.8) +
        geom_text(
          aes(label = Label),
          position = position_dodge(width = 0.9),
          vjust = -0.25,
          size = 3.0
        ) +
        theme_bw() +
        labs(
          title = paste0("CIS% and TRANS% of NO_CUT (4-combo totals) | Batch: ", batch_id, " | ", dsb_level),
          x = "Time point",
          y = "% of NO_CUT",
          fill = ""
        )
      print(p)
    }
  }
}









3.6 Two-step normalization: NO_CUT baseline + 1% spike-in
anchor
This section implements the two-step normalization scheme discussed:
- Step 1: normalize each feature count to its DSB-specific NO_CUT
total (the background/intact baseline).
- Step 2: scale those NO_CUT-normalized ratios by the 1%
spike-in total in the same DSB and repeat-class to approximate a
per-cell-like quantity.
For a feature count \(C\) in a given
(batch × time × replicate × DSB × repeat-class), and the DSB-specific
NO_CUT total \(NC\):
\[R_{\text{NOCUT}} =
\frac{C}{NC}\]
In these PD summary CSVs, the 1% control appears as an internal
spike-in allele (e.g. alleles containing
1PERCENT_CONTROL). Let \(S\) be the total spike-in counts in the
same (DSB × repeat-class). A useful per-cell-like scaling is:
\[R_{\text{final}} = \frac{C}{S}
\quad\text{and}\quad R_{\text{final,per cell}} =
0.01\cdot\frac{C}{S}\]
This corresponds to: normalize to NO_CUT, then rescale by the
spike-in in that DSB/class (algebraically, the NO_CUT denominator
cancels when both are taken in the same group).
# Two-step scheme for the PD CSVs:
# - NO_CUT is represented as allele == "NO_CUT" (split by repeat-class and DSB)
# - The 1% spike-in is represented as alleles like "*_1PERCENT_CONTROL" (also split by repeat-class and DSB)
#
# For each (batch × time_point × replicate × DSB × repeat-class):
# Step 1: R_NOCUT = C_feat / C_NOCUT_total
# Step 2: scale using spike-in: PerCell ~= 0.01 * C_feat / C_spike
# (equivalently: 0.01 * R_NOCUT / (C_spike / C_NOCUT_total))
# Define the NO_CUT regex only if no earlier chunk already did, so an existing
# session definition is not clobbered. The pattern is case-insensitive and
# matches NO_CUT with any (or no) separator; "UNCUT" also matches substrings
# such as "UNCUTTABLE".
if (!exists("uncuttable_regex")) {
uncuttable_pattern <- "NO[_\\W]*CUT|NOCUT|UNCUT"
uncuttable_regex <- stringr::regex(uncuttable_pattern, ignore_case = TRUE)
}
# Spike-in regex, also defined only when missing. "1PERCENT" already covers
# "1PERCENT_CONTROL"; the other alternatives catch naming variants.
if (!exists("percent_control_regex")) {
percent_control_regex <- stringr::regex("1PERCENT|PERCENTCONTROL|PERCENT_CONTROL", ignore_case = TRUE)
}
# The four ligation combos treated as features downstream.
combos_4 <- c("A_to_B", "C_to_D", "A_to_D", "C_to_B")
# Flag each row as spike-in and/or NO_CUT by matching both the allele and the
# alignment name. str_detect() returns NA for NA input, so downstream sums must
# use na.rm = TRUE. Note the alignment-name NO_CUT check deliberately (?) omits
# the "UNCUT" alternative used for alleles — confirm this asymmetry is intended.
df_lab <- dat_raw %>%
mutate(
.allele_chr = as.character(allele),
.aln_chr = as.character(alignment_name),
is_spike = stringr::str_detect(.allele_chr, percent_control_regex) | stringr::str_detect(.aln_chr, percent_control_regex),
is_nocut_raw = stringr::str_detect(.allele_chr, uncuttable_regex) | stringr::str_detect(.aln_chr, stringr::regex("NO[_\\W]*CUT|NOCUT", ignore_case = TRUE))
)
# Heuristic to avoid counting cross-DSB NO_CUT rows (e.g. "DSB2_NO_CUT" entries under DSB1)
# DSB1 rows reject alignment names that look like DSB2's NO_CUT and vice versa;
# any other DSB value (e.g. TRANS) keeps the raw flag unchanged.
df_lab <- df_lab %>%
mutate(
is_nocut = case_when(
DSB == "DSB1" ~ is_nocut_raw & !stringr::str_detect(.aln_chr, stringr::regex("DSB2[_\\W]*NO[_\\W]*CUT", ignore_case = TRUE)),
DSB == "DSB2" ~ is_nocut_raw & !stringr::str_detect(.aln_chr, stringr::regex("DSB1[_\\W]*NO[_\\W]*CUT", ignore_case = TRUE)),
TRUE ~ is_nocut_raw
)
)
# Sanity check: total rows and how many were flagged in each category.
df_lab %>%
summarise(
Rows = n(),
Rows_spike = sum(is_spike, na.rm = TRUE),
Rows_nocut = sum(is_nocut, na.rm = TRUE)
) %>%
knitr::kable(caption = "Diagnostic: detected NO_CUT and 1% spike-in rows in dat_raw")
Diagnostic: detected NO_CUT and 1% spike-in rows in
dat_raw
| 1107 |
23 |
63 |
# Totals per group
# NO_CUT baseline (excluding spike-in rows) and spike-in totals, one row per
# (batch × time × replicate × DSB × repeat-class). Subsetting with an NA flag
# yields NA elements, which na.rm = TRUE discards — same result as the
# if_else(..., 0) formulation.
group_denoms <- df_lab |>
  group_by(batch, time_point, replicate, DSB, `repeat`) |>
  summarise(
    C_NOCUT_total = sum(count[is_nocut & !is_spike], na.rm = TRUE),
    C_spike_total = sum(count[is_spike], na.rm = TRUE),
    .groups = "drop"
  )
# How many groups have each denominator available.
group_denoms |>
  summarise(
    Groups = n(),
    Groups_with_NOCUT = sum(C_NOCUT_total > 0, na.rm = TRUE),
    Groups_with_spike = sum(C_spike_total > 0, na.rm = TRUE)
  ) |>
  knitr::kable(caption = "Diagnostic: denominator availability by (batch × time × replicate × DSB × repeat-class)")
Diagnostic: denominator availability by (batch × time ×
replicate × DSB × repeat-class)
| 42 |
42 |
21 |
# Preview the first 20 denominator groups in key order.
group_denoms |>
  arrange(batch, time_point, replicate, DSB, `repeat`) |>
  slice_head(n = 20) |>
  knitr::kable(caption = "Preview: NO_CUT totals and spike-in totals (first 20 groups)")
Preview: NO_CUT totals and spike-in totals (first 20
groups)
| 4 |
0 |
1 |
DSB1 |
ALL |
28178 |
0 |
| 4 |
0 |
1 |
DSB2 |
ALL |
8847 |
0 |
| 4 |
0 |
1 |
TRANS |
ALL |
115 |
0 |
| 4 |
120 |
1 |
DSB1 |
ALL |
103838 |
0 |
| 4 |
120 |
1 |
DSB2 |
ALL |
23505 |
0 |
| 4 |
120 |
1 |
TRANS |
ALL |
2106 |
0 |
| 6 |
0 |
1 |
DSB1 |
ALL |
13004 |
0 |
| 6 |
0 |
1 |
DSB1 |
INTACT |
23961 |
7649 |
| 6 |
0 |
1 |
DSB1 |
SSA |
15117 |
187 |
| 6 |
0 |
1 |
DSB2 |
ALL |
2571 |
0 |
| 6 |
0 |
1 |
DSB2 |
INTACT |
21082 |
33 |
| 6 |
0 |
1 |
DSB2 |
SSA |
10874 |
0 |
| 6 |
0 |
1 |
TRANS |
ALL |
482 |
0 |
| 6 |
0 |
1 |
TRANS |
INTACT |
524 |
31 |
| 6 |
0 |
1 |
TRANS |
SSA |
140 |
9 |
| 6 |
120 |
1 |
DSB1 |
ALL |
20548 |
0 |
| 6 |
120 |
1 |
DSB2 |
ALL |
2325 |
0 |
| 6 |
120 |
1 |
TRANS |
ALL |
731 |
0 |
| 6 |
180 |
1 |
DSB1 |
INTACT |
22085 |
3927 |
| 6 |
180 |
1 |
DSB1 |
SSA |
91150 |
11342 |
# Feature counts (allele × combo)
feat_counts <- df_lab |>
  filter(combo %in% combos_4) |>
  group_by(batch, time_point, replicate, DSB, `repeat`, allele, combo) |>
  summarise(C_feat = sum(count, na.rm = TRUE), .groups = "drop")
# Step 1 (ratio to NO_CUT) and Step 2 (spike-in scaling) computed side by side;
# missing or zero denominators yield NA rather than Inf/NaN.
feat_norm <- feat_counts |>
  left_join(
    group_denoms,
    by = c("batch", "time_point", "replicate", "DSB", "repeat")
  ) |>
  mutate(
    R_NOCUT = case_when(C_NOCUT_total > 0 ~ C_feat / C_NOCUT_total, TRUE ~ NA_real_),
    Spike_per_NOCUT = case_when(C_NOCUT_total > 0 ~ C_spike_total / C_NOCUT_total, TRUE ~ NA_real_),
    R_final = case_when(C_spike_total > 0 ~ C_feat / C_spike_total, TRUE ~ NA_real_),
    R_final_per_cell = 0.01 * R_final
  )
# Preview the first 20 normalized feature rows in key order.
feat_norm |>
  arrange(batch, time_point, replicate, DSB, `repeat`, allele, combo) |>
  head(20) |>
  knitr::kable(caption = "Preview: two-step outputs. R_NOCUT = C_feat / NO_CUT_total; R_final = C_feat / spike_total; R_final_per_cell = 0.01 * R_final (first 20).")
Preview: two-step outputs. R_NOCUT = C_feat / NO_CUT_total;
R_final = C_feat / spike_total; R_final_per_cell = 0.01 * R_final (first
20).
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L01 |
A_to_B |
23365 |
28178 |
0 |
0.8291930 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L05 |
A_to_B |
15119 |
28178 |
0 |
0.5365533 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L07 |
A_to_B |
19481 |
28178 |
0 |
0.6913550 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr12_L12 |
A_to_B |
22779 |
28178 |
0 |
0.8083966 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L01 |
A_to_B |
27868 |
28178 |
0 |
0.9889985 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L04 |
A_to_B |
54 |
28178 |
0 |
0.0019164 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L15 |
A_to_B |
14456 |
28178 |
0 |
0.5130243 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L17_2 |
A_to_B |
28338 |
28178 |
0 |
1.0056782 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr15_L18 |
A_to_B |
4689 |
28178 |
0 |
0.1664064 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr3_L02 |
A_to_B |
26669 |
28178 |
0 |
0.9464476 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr3_L03 |
A_to_B |
19 |
28178 |
0 |
0.0006743 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr3_L04 |
A_to_B |
38319 |
28178 |
0 |
1.3598907 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L01 |
A_to_B |
16192 |
28178 |
0 |
0.5746327 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L03 |
A_to_B |
19387 |
28178 |
0 |
0.6880190 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L06_2 |
A_to_B |
18430 |
28178 |
0 |
0.6540564 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L10_2 |
A_to_B |
45340 |
28178 |
0 |
1.6090567 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L12_2 |
A_to_B |
22582 |
28178 |
0 |
0.8014054 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L13 |
A_to_B |
19743 |
28178 |
0 |
0.7006530 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr4_L15_2 |
A_to_B |
21694 |
28178 |
0 |
0.7698914 |
0 |
NA |
NA |
| 4 |
0 |
1 |
DSB1 |
ALL |
Chr7_990K |
A_to_B |
15400 |
28178 |
0 |
0.5465257 |
0 |
NA |
NA |
# Plot: per-cell-like abundance scaled by spike-in
# Keep only rows with a usable spike-in scaling, and drop the spike-in alleles
# themselves so the control is not plotted as a feature.
plot_final <- feat_norm |>
  filter(
    !is.na(R_final_per_cell),
    !stringr::str_detect(as.character(allele), percent_control_regex)
  ) |>
  mutate(
    DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")),
    combo = factor(combo, levels = combos_4),
    Label = if_else(is.na(R_final_per_cell), NA_character_, format(round(R_final_per_cell, 4), nsmall = 4))
  )
if (nrow(plot_final) == 0) {
  message("No R_final_per_cell values to plot. Check that spike-in rows (1PERCENT_CONTROL) and NO_CUT rows are present with nonzero counts.")
} else {
  batches <- plot_final |> distinct(batch) |> arrange(batch) |> pull(batch)
  for (batch_id in batches) {
    df_batch <- filter(plot_final, batch == batch_id)
    for (dsb_level in c("DSB1", "DSB2", "TRANS")) {
      df_panel <- filter(df_batch, DSB == dsb_level)
      if (nrow(df_panel) == 0) next
      p <- ggplot(df_panel, aes(x = allele, y = R_final_per_cell, fill = combo)) +
        geom_col(position = position_dodge(width = 0.9), width = 0.85) +
        geom_text(
          aes(label = Label),
          position = position_dodge(width = 0.9),
          vjust = -0.25,
          size = 2.4,
          angle = 90
        ) +
        facet_grid(`repeat` ~ time_point, scales = "free_x") +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        labs(
          title = paste0("Two-step scaled per-cell-like abundance (0.01 * C_feat / spike_total) | Batch: ", batch_id, " | ", dsb_level),
          x = "Allele",
          y = "Approx per-cell abundance (relative units)",
          fill = "Combo"
        )
      print(p)
    }
  }
}





